import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the Spotify track-features dataset: one row per track, 18 columns
# (genre, artist, track metadata plus audio features).
df = pd.read_csv('SpotifyFeatures.csv')
# Display the full frame (notebook cell output).
df
| genre | artist_name | track_name | track_id | popularity | acousticness | danceability | duration_ms | energy | instrumentalness | key | liveness | loudness | mode | speechiness | tempo | time_signature | valence | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Movie | Henri Salvador | C'est beau de faire un Show | 0BRjO6ga9RKCKjfDqeFgWV | 0 | 0.61100 | 0.389 | 99373 | 0.910 | 0.000000 | C# | 0.3460 | -1.828 | Major | 0.0525 | 166.969 | 4/4 | 0.814 |
| 1 | Movie | Martin & les fées | Perdu d'avance (par Gad Elmaleh) | 0BjC1NfoEOOusryehmNudP | 1 | 0.24600 | 0.590 | 137373 | 0.737 | 0.000000 | F# | 0.1510 | -5.559 | Minor | 0.0868 | 174.003 | 4/4 | 0.816 |
| 2 | Movie | Joseph Williams | Don't Let Me Be Lonely Tonight | 0CoSDzoNIKCRs124s9uTVy | 3 | 0.95200 | 0.663 | 170267 | 0.131 | 0.000000 | C | 0.1030 | -13.879 | Minor | 0.0362 | 99.488 | 5/4 | 0.368 |
| 3 | Movie | Henri Salvador | Dis-moi Monsieur Gordon Cooper | 0Gc6TVm52BwZD07Ki6tIvf | 0 | 0.70300 | 0.240 | 152427 | 0.326 | 0.000000 | C# | 0.0985 | -12.178 | Major | 0.0395 | 171.758 | 4/4 | 0.227 |
| 4 | Movie | Fabien Nataf | Ouverture | 0IuslXpMROHdEPvSl1fTQK | 4 | 0.95000 | 0.331 | 82625 | 0.225 | 0.123000 | F | 0.2020 | -21.150 | Major | 0.0456 | 140.576 | 4/4 | 0.390 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 232720 | Soul | Slave | Son Of Slide | 2XGLdVl7lGeq8ksM6Al7jT | 39 | 0.00384 | 0.687 | 326240 | 0.714 | 0.544000 | D | 0.0845 | -10.626 | Major | 0.0316 | 115.542 | 4/4 | 0.962 |
| 232721 | Soul | Jr Thomas & The Volcanos | Burning Fire | 1qWZdkBl4UVPj9lK6HuuFM | 38 | 0.03290 | 0.785 | 282447 | 0.683 | 0.000880 | E | 0.2370 | -6.944 | Minor | 0.0337 | 113.830 | 4/4 | 0.969 |
| 232722 | Soul | Muddy Waters | (I'm Your) Hoochie Coochie Man | 2ziWXUmQLrXTiYjCg2fZ2t | 47 | 0.90100 | 0.517 | 166960 | 0.419 | 0.000000 | D | 0.0945 | -8.282 | Major | 0.1480 | 84.135 | 4/4 | 0.813 |
| 232723 | Soul | R.LUM.R | With My Words | 6EFsue2YbIG4Qkq8Zr9Rir | 44 | 0.26200 | 0.745 | 222442 | 0.704 | 0.000000 | A | 0.3330 | -7.137 | Major | 0.1460 | 100.031 | 4/4 | 0.489 |
| 232724 | Soul | Mint Condition | You Don't Have To Hurt No More | 34XO9RwPMKjbvRry54QzWn | 35 | 0.09730 | 0.758 | 323027 | 0.470 | 0.000049 | G# | 0.0836 | -6.708 | Minor | 0.0287 | 113.897 | 4/4 | 0.479 |
232725 rows × 18 columns
df.describe()
| popularity | acousticness | danceability | duration_ms | energy | instrumentalness | liveness | loudness | speechiness | tempo | valence | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 232725.000000 | 232725.000000 | 232725.000000 | 2.327250e+05 | 232725.000000 | 232725.000000 | 232725.000000 | 232725.000000 | 232725.000000 | 232725.000000 | 232725.000000 |
| mean | 41.127502 | 0.368560 | 0.554364 | 2.351223e+05 | 0.570958 | 0.148301 | 0.215009 | -9.569885 | 0.120765 | 117.666585 | 0.454917 |
| std | 18.189948 | 0.354768 | 0.185608 | 1.189359e+05 | 0.263456 | 0.302768 | 0.198273 | 5.998204 | 0.185518 | 30.898907 | 0.260065 |
| min | 0.000000 | 0.000000 | 0.056900 | 1.538700e+04 | 0.000020 | 0.000000 | 0.009670 | -52.457000 | 0.022200 | 30.379000 | 0.000000 |
| 25% | 29.000000 | 0.037600 | 0.435000 | 1.828570e+05 | 0.385000 | 0.000000 | 0.097400 | -11.771000 | 0.036700 | 92.959000 | 0.237000 |
| 50% | 43.000000 | 0.232000 | 0.571000 | 2.204270e+05 | 0.605000 | 0.000044 | 0.128000 | -7.762000 | 0.050100 | 115.778000 | 0.444000 |
| 75% | 55.000000 | 0.722000 | 0.692000 | 2.657680e+05 | 0.787000 | 0.035800 | 0.264000 | -5.501000 | 0.105000 | 139.054000 | 0.660000 |
| max | 100.000000 | 0.996000 | 0.989000 | 5.552917e+06 | 0.999000 | 0.999000 | 1.000000 | 3.744000 | 0.967000 | 242.903000 | 1.000000 |
# Track counts per genre (categorical histogram).
plt.figure(figsize=(12,10))
sns.histplot(data=df, x='genre',color='c')
plt.xticks(rotation=70);
# Distribution of musical keys across all tracks.
plt.figure(figsize=(12,10))
sns.histplot(data=df, x='key')
<AxesSubplot:xlabel='key', ylabel='Count'>
# Histograms of every numeric column. DataFrame.hist builds its own grid of
# subplots, so pass figsize directly: handing it a single pre-created Axes
# made pandas clear the figure and emit the "figure containing the passed
# axes is being cleared" UserWarning.
df.hist(figsize=(15, 20), bins=10);
C:\Users\annam\AppData\Local\Temp\ipykernel_31132\2283546626.py:3: UserWarning: To output multiple subplots, the figure containing the passed axes is being cleared. df.hist(ax = ax,bins=10);
df.columns
Index(['genre', 'artist_name', 'track_name', 'track_id', 'popularity',
'acousticness', 'danceability', 'duration_ms', 'energy',
'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
'speechiness', 'tempo', 'time_signature', 'valence'],
dtype='object')
df.sort_values('popularity',ascending=False)[['track_name','artist_name','popularity']].drop_duplicates()[0:15]
| track_name | artist_name | popularity | |
|---|---|---|---|
| 9027 | 7 rings | Ariana Grande | 100 |
| 86951 | Wow. | Post Malone | 99 |
| 107802 | break up with your girlfriend, i'm bored | Ariana Grande | 99 |
| 66643 | Con Calma | Daddy Yankee | 98 |
| 107829 | Sweet but Psycho | Ava Max | 97 |
| 86953 | Sunflower - Spider-Man: Into the Spider-Verse | Post Malone | 97 |
| 107875 | Calma - Remix | Pedro Capó | 97 |
| 92824 | Without Me | Halsey | 97 |
| 107812 | Happier | Marshmello | 97 |
| 107810 | Dancing With A Stranger (with Normani) | Sam Smith | 97 |
| 107851 | Taki Taki (with Selena Gomez, Ozuna & Cardi B) | DJ Snake | 96 |
| 107830 | Shallow | Lady Gaga | 96 |
| 86952 | MIDDLE CHILD | J. Cole | 96 |
| 138916 | Secreto | Anuel Aa | 96 |
| 138917 | Baila Baila Baila | Ozuna | 95 |
# Box plots of popularity per genre, ordered by descending mean popularity.
order = df.groupby(["genre"])["popularity"].mean().sort_values(ascending=False).index
sns.catplot(data=df, x='genre', y='popularity', height=5, aspect=2, kind='box',order=order)
plt.xticks(rotation=70);
# Print the name and distinct values of every non-numeric column.
for column in df.select_dtypes(include=['object','category']):
    uniques = df[column].unique()
    print('%s\n%s\n' % (column, uniques))
genre ['Movie' 'R&B' 'A Capella' 'Alternative' 'Country' 'Dance' 'Electronic' 'Anime' 'Folk' 'Blues' 'Opera' 'Hip-Hop' "Children's Music" 'Children’s Music' 'Rap' 'Indie' 'Classical' 'Pop' 'Reggae' 'Reggaeton' 'Jazz' 'Rock' 'Ska' 'Comedy' 'Soul' 'Soundtrack' 'World'] artist_name ['Henri Salvador' 'Martin & les fées' 'Joseph Williams' ... 'Dharmasoul' 'Swim' 'Jr Thomas & The Volcanos'] track_name ["C'est beau de faire un Show" "Perdu d'avance (par Gad Elmaleh)" "Don't Let Me Be Lonely Tonight" ... 'P.O.P.' 'Burning Fire' "You Don't Have To Hurt No More"] track_id ['0BRjO6ga9RKCKjfDqeFgWV' '0BjC1NfoEOOusryehmNudP' '0CoSDzoNIKCRs124s9uTVy' ... '2iZf3EUedz9MPqbAvXdpdA' '1qWZdkBl4UVPj9lK6HuuFM' '34XO9RwPMKjbvRry54QzWn'] key ['C#' 'F#' 'C' 'F' 'G' 'E' 'D#' 'G#' 'D' 'A#' 'A' 'B'] mode ['Major' 'Minor'] time_signature ['4/4' '5/4' '3/4' '1/4' '0/4']
plt.figure(figsize=(12,10))
# Correlation heat map of the numeric features. numeric_only=True is required
# on pandas >= 2.0, where DataFrame.corr no longer silently drops the object
# columns (genre, key, mode, ...) and would raise a TypeError instead.
sns.heatmap(df.corr(numeric_only=True), cmap='twilight_shifted', annot=True)
<AxesSubplot:>
# Bucket popularity into three classes: 0 (<=45), 1 (46-60), 2 (>60).
# pd.cut is vectorized and replaces the slow row-wise df.apply(..., axis=1).
df['popularity1'] = pd.cut(df['popularity'],
                           bins=[-np.inf, 45, 60, np.inf],
                           labels=[0, 1, 2]).astype(int)
# Pass the data as a keyword argument: positional Series arguments are the
# source of the seaborn 0.12 FutureWarning and stop working later.
sns.countplot(x=df['popularity1'], palette='pastel').set_xticklabels(['0-45', '46-60', '61 & more']);
C:\Users\annam\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
# Feature matrix: audio features plus the categorical descriptors
# (genre, key, mode, time_signature). artist_name/track ids excluded.
feature_cols = ['genre', 'acousticness', 'danceability', 'duration_ms', 'energy',
                'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
                'speechiness', 'tempo', 'time_signature', 'valence']
X0 = df[feature_cols]
# Target: the 3-class popularity bucket created above.
y = df['popularity1']
# One-hot encode the categorical columns (-> 56 feature columns).
X = pd.get_dummies(X0)
X.head()
| acousticness | danceability | duration_ms | energy | instrumentalness | liveness | loudness | speechiness | tempo | valence | ... | key_F# | key_G | key_G# | mode_Major | mode_Minor | time_signature_0/4 | time_signature_1/4 | time_signature_3/4 | time_signature_4/4 | time_signature_5/4 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.611 | 0.389 | 99373 | 0.910 | 0.000 | 0.3460 | -1.828 | 0.0525 | 166.969 | 0.814 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 1 | 0.246 | 0.590 | 137373 | 0.737 | 0.000 | 0.1510 | -5.559 | 0.0868 | 174.003 | 0.816 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 2 | 0.952 | 0.663 | 170267 | 0.131 | 0.000 | 0.1030 | -13.879 | 0.0362 | 99.488 | 0.368 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 3 | 0.703 | 0.240 | 152427 | 0.326 | 0.000 | 0.0985 | -12.178 | 0.0395 | 171.758 | 0.227 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4 | 0.950 | 0.331 | 82625 | 0.225 | 0.123 | 0.2020 | -21.150 | 0.0456 | 140.576 | 0.390 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
5 rows × 56 columns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Fit the scaler on the training split only, then apply it to both splits,
# so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression  # was missing: used in the grid below
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Model-selection pipeline: GridSearchCV swaps the "classifier" step.
pipe = Pipeline([
    ("classifier", RandomForestClassifier())])
sp = [
    {"classifier": [RandomForestClassifier(), xgb.XGBClassifier()],
     "classifier__n_estimators": [100, 120]},
    {"classifier": [LogisticRegression()],
     # Must be a sequence of scalar C values. The original wrapped the array
     # in another list ([np.logspace(...)]), so the whole array was passed as
     # a single C and every LogisticRegression fit failed with
     # "Penalty term must be positive" (the FitFailedWarning in the output).
     "classifier__C": np.logspace(0, 4, 10)}
]
grid = GridSearchCV(pipe, sp, cv=3, verbose=0)
ml_g = grid.fit(X_train, y_train)
C:\Users\annam\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py:372: FitFailedWarning:
3 fits failed out of a total of 15.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\annam\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\annam\anaconda3\lib\site-packages\sklearn\pipeline.py", line 394, in fit
self._final_estimator.fit(Xt, y, **fit_params_last_step)
File "C:\Users\annam\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1464, in fit
raise ValueError("Penalty term must be positive; got (C=%r)" % self.C)
ValueError: Penalty term must be positive; got (C=array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
3.59381366e+03, 1.00000000e+04]))
warnings.warn(some_fits_failed_message, FitFailedWarning)
C:\Users\annam\anaconda3\lib\site-packages\sklearn\model_selection\_search.py:969: UserWarning: One or more of the test scores are non-finite: [0.75956064 0.75983994 0.76597916 0.76766033 nan]
warnings.warn(
ml_g.best_params_
{'classifier': XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=120, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=None, ...),
'classifier__n_estimators': 120}
# Hold-out accuracy of the best grid-search model.
p2 = ml_g.predict(X_test)
accuracy_score(y_test,p2)
0.7672145235793318
y_train.value_counts()
0 102193 1 58057 2 25930 Name: popularity1, dtype: int64
from sklearn.linear_model import LogisticRegression  # explicit import (never imported earlier in the file)

# Baseline logistic regression with the SAG solver. max_iter is raised from
# the default 100 because SAG did not converge on this data at the default
# budget (the ConvergenceWarning in the original run).
ml3 = LogisticRegression(solver='sag', max_iter=1000)
ml3.fit(X_train, y_train)
p3 = ml3.predict(X_test)
# accuracy_score(y_true, y_pred) is the documented argument order; plain
# accuracy is symmetric, so the original swapped call gave the same number.
accuracy_score(y_test, p3)
C:\Users\annam\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn(
0.7604683639488667
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
# 1% random sample (~2.3k rows) so that one-hot encoding artist_name
# (which alone adds ~1.7k columns) and PCA stay tractable.
df_a = df.sample(frac=0.01)
X0_a = df_a[['artist_name','genre','acousticness', 'danceability', 'duration_ms', 'energy',
'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
'speechiness', 'tempo', 'time_signature', 'valence']]
y_a = df_a['popularity1']
# One-hot encode all categorical columns (-> ~1.7k sparse dummy columns).
X_a = pd.get_dummies(X0_a)
X_a.head()
| acousticness | danceability | duration_ms | energy | instrumentalness | liveness | loudness | speechiness | tempo | valence | ... | key_F | key_F# | key_G | key_G# | mode_Major | mode_Minor | time_signature_1/4 | time_signature_3/4 | time_signature_4/4 | time_signature_5/4 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 60497 | 0.487 | 0.511 | 344320 | 0.391 | 0.000000 | 0.1120 | -10.850 | 0.0443 | 112.196 | 0.580 | ... | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 46728 | 0.167 | 0.481 | 189293 | 0.689 | 0.001530 | 0.7390 | -11.629 | 0.2140 | 144.851 | 0.782 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 50849 | 0.388 | 0.770 | 215493 | 0.565 | 0.000179 | 0.0846 | -7.185 | 0.0333 | 114.069 | 0.266 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 45140 | 0.264 | 0.458 | 235495 | 0.930 | 0.000353 | 0.2250 | -4.303 | 0.0983 | 170.907 | 0.583 | ... | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 |
| 25773 | 0.566 | 0.706 | 304333 | 0.520 | 0.857000 | 0.1580 | -7.496 | 0.0290 | 108.994 | 0.354 | ... | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
5 rows × 1746 columns
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_a, y_a, test_size=0.2, random_state=42)
# FeatureUnion applies BOTH transformers to the raw input and concatenates
# the results: the PCA components (computed on unscaled features) are
# appended next to the standardized originals.
# NOTE(review): if the intent was "scale, then PCA", this should be a
# Pipeline rather than a FeatureUnion — confirm.
preprocess = FeatureUnion([("std", StandardScaler()), ("pca",PCA())])
pipe_pca = Pipeline([
("preprocess", preprocess),
("classifier", xgb.XGBClassifier())])
# Single-point "grid": 100 PCA components, 120 boosting rounds.
search_p = [{"preprocess__pca__n_components": [100],
"classifier__n_estimators": [120]}]
grid_pca = GridSearchCV(pipe_pca, param_grid=search_p, cv=3)
mg2 = grid_pca.fit(X_train1,y_train1)
mg2.best_params_
{'classifier__n_estimators': 120, 'preprocess__pca__n_components': 100}
# Hold-out accuracy of the PCA pipeline on the 1% sample.
pp = mg2.predict(X_test1)
accuracy_score(y_test1,pp)
0.7253218884120172
mg2.best_estimator_.named_steps['preprocess']
FeatureUnion(transformer_list=[('std', StandardScaler()),
('pca', PCA(n_components=500))])
mg2.best_estimator_.get_params()['preprocess__transformer_list']
[('std', StandardScaler()), ('pca', PCA(n_components=500))]
# Baseline: the identical pipeline WITHOUT the PCA branch, to measure what
# the PCA features contribute to accuracy.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_a, y_a, test_size=0.2, random_state=42)
preprocess_no_pca = FeatureUnion([("std", StandardScaler())])
pipe_no_pca = Pipeline([
    ("preprocess_no_pca", preprocess_no_pca),
    ("classifier", xgb.XGBClassifier())])
# Single-point grid; the commented-out PCA key was dead code and is removed.
search_no_p = [{"classifier__n_estimators": [120]}]
grid_no_pca = GridSearchCV(pipe_no_pca, param_grid=search_no_p, cv=3)
mg_npca = grid_no_pca.fit(X_train1,y_train1)
pp1 = mg_npca.predict(X_test1)
accuracy_score(y_test1,pp1)
0.6974248927038627
from sklearn.cluster import AgglomerativeClustering

# Cluster a fresh 1% sample on its scaled numeric features.
sample1 = df.sample(frac=0.01)
# .copy() so the later column assignment writes to an independent frame
# instead of a view of sample1 (the original raised SettingWithCopyWarning).
sample2 = sample1[['acousticness', 'danceability', 'duration_ms', 'energy',
                   'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo',
                   'valence', 'popularity']].copy()
sample2_s = scaler.fit_transform(sample2)
# Defaults: 2 clusters, Ward linkage.
cluAgg = AgglomerativeClustering()
cluAgg_m = cluAgg.fit(sample2_s)
# Attach the cluster labels to both frames for plotting/analysis.
sample1['cluAgg'] = cluAgg_m.labels_
sample2['cluAgg'] = cluAgg_m.labels_
C:\Users\annam\AppData\Local\Temp\ipykernel_33544\130059744.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy sample2['cluAgg'] = cluAgg_m.labels_
import plotly.express as px
# Interactive 3-D scatter of the sample, colored and symbolized by the
# agglomerative cluster label; marker size encodes popularity.
fig = px.scatter_3d(sample1, x='acousticness', y='danceability', z='popularity',
color='cluAgg', size='popularity', size_max=30,hover_data=sample1,
symbol='cluAgg', opacity=1)
# tight layout
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
# Genre composition of agglomerative cluster 0, most frequent genre first.
fig = plt.figure(figsize = (15,3));
order = (sample1[sample1.cluAgg==0]).groupby(["genre"])["genre"].count().sort_values(ascending=False).index
sns.countplot(data=sample1[sample1.cluAgg==0],x='genre',order=order);
plt.xticks(rotation=70);
plt.title("Agglomerative Clustering - 0")
Text(0.5, 1.0, 'Agglomerative Clustering - 0')
# Genre composition of agglomerative cluster 1, most frequent genre first.
fig = plt.figure(figsize = (15,3));
order = (sample1[sample1.cluAgg==1]).groupby(["genre"])["genre"].count().sort_values(ascending=False).index
sns.countplot(data=sample1[sample1.cluAgg==1],x='genre',order=order);
plt.xticks(rotation=70);
plt.title("Agglomerative Clustering - 1")
Text(0.5, 1.0, 'Agglomerative Clustering - 1')
from sklearn.cluster import KMeans

# K-means with 2 clusters on the same scaled sample, for comparison with the
# agglomerative labels.
clus_k = KMeans(n_clusters=2)
clus_k_m = clus_k.fit(sample2_s)
# Attach the labels once (the original assigned this column twice in a row).
sample1['kmeans'] = clus_k_m.labels_
# Interactive 3-D scatter colored/symbolized by k-means cluster.
fig = px.scatter_3d(sample1, x='acousticness', y='danceability', z='popularity',
                    color='kmeans', size='popularity', size_max=30, hover_data=sample1,
                    symbol='kmeans', opacity=1)
# tight layout
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
# Genre composition of k-means cluster 0, most frequent genre first.
fig = plt.figure(figsize = (15,3));
order = (sample1[sample1.kmeans==0]).groupby(["genre"])["genre"].count().sort_values(ascending=False).index
sns.countplot(data=sample1[sample1.kmeans==0],x='genre',order=order);
plt.xticks(rotation=70);
plt.title("Kmeans Clustering - 0")
Text(0.5, 1.0, 'Kmeans Clustering - 0')
# Genre composition of k-means cluster 1, most frequent genre first.
fig = plt.figure(figsize = (15,3));
order = (sample1[sample1.kmeans==1]).groupby(["genre"])["genre"].count().sort_values(ascending=False).index
sns.countplot(data=sample1[sample1.kmeans==1],x='genre',order=order);
plt.xticks(rotation=70);
plt.title("Kmeans Clustering - 1")
Text(0.5, 1.0, 'Kmeans Clustering - 1')